/*
  author Sylvain Bertrand <sylvain.bertrand@gmail.com>
  Protected by linux GNU GPLv2
  Copyright 2012-2014
*/
#include <linux/pci.h>
#include <linux/cdev.h>
#include <linux/delay.h>

#include <alga/rng_mng.h>
#include <alga/timing.h>
#include <uapi/alga/pixel_fmts.h>
#include <uapi/alga/amd/dce6/dce6.h>

#include "mc.h"
#include "rlc.h"
#include "ih.h"
#include "fence.h"
#include "ring.h"
#define DMAS_C
#include "dmas.h"
#undef DMAS_C
#include "ba.h"
#include "cps.h"
#include "gpu.h"
#include "drv.h"

#include "regs.h"

/* human-readable names of the asynchronous DMA engines, indexed by engine id */
char *dmas_str[DMAS_N] = {
	"dma0",
	"dma1"
};

/*
 * (Re)program one DMA engine after a soft reset: fence writeback slot,
 * tiling config, semaphore timers, then the ring buffer registers, and
 * finally enable the ring. Register order follows upstream.
 */
static void resume(struct pci_dev *dev, u8 dma)
{
	struct dev_drv_data *dd;
	u32 rb_ctl;
	u64 wb_gpu_addr;
	u32 dma_ctl;

	dd = pci_get_drvdata(dev);

	/* bind the engine's fence to its slot in the writeback page */
	fence_init(&dd->dmas[dma].fence, dd->ba.wb_map->cpu_addr
							+ wb_dma_fence_of[dma]);

	wr32(dev, dd->addr_cfg, regs_dma_tiling_cfg[dma]);
	
	/* clear the semaphore timer control registers */
	wr32(dev, 0, regs_dma_sem_incomplete_timer_ctl[dma]);
	wr32(dev, 0, regs_dma_sem_wait_fail_timer_ctl[dma]);

	/* do the ring buffer ctl init in 2 times, follow upstream */
	rb_ctl = set(DRC_RB_SZ, DMA_RING_LOG2_DWS);
	wr32(dev, rb_ctl, regs_dma_rb_ctl[dma]);

	wr32(dev, 0, regs_dma_rb_rptr[dma]);
	wr32(dev, 0, regs_dma_rb_wptr[dma]);

	/* gpu address of the engine's rptr writeback slot */
	wb_gpu_addr = dd->ba.wb_map->gpu_addr + wb_dma_rptr_of[dma];

	/* must be dw aligned */
	wr32(dev, upper_32_bits(wb_gpu_addr), regs_dma_rb_rptr_addr_hi[dma]);	
	wr32(dev, lower_32_bits(wb_gpu_addr), regs_dma_rb_rptr_addr_lo[dma]);	

	/*
	 * alignment is fine since aperture allocation are GPU_PAGE_SZ
	 * aligned
	 */
	wr32(dev, dd->ba.dma_rings_maps[dma]->gpu_addr >> 8,
							regs_dma_rb_base[dma]);

	wr32(dev, DIC_DMA_IB_ENA | DIC_CMD_VMID_FORCE, regs_dma_ib_ctl[dma]);

	/* mask the "context empty" interrupt */
	dma_ctl = rr32(dev, regs_dma_ctl[dma]);
	dma_ctl &= ~DC_CTX_EMPTY_INT_ENA;
	wr32(dev, dma_ctl, regs_dma_ctl[dma]);

	/* driver-side write pointer restarts at the ring base */
	dd->dmas[dma].wptr = 0;
	wr32(dev, 0, regs_dma_rb_wptr[dma]);
	wr32(dev, 0, regs_dma_rb_rptr[dma]);

	/* 2nd part of ring buffer ctl init */
	rb_ctl |= DRC_RPTR_WRBACK_ENA | DRC_RB_ENA;
	wr32(dev, rb_ctl, regs_dma_rb_ctl[dma]);
}

/* Halt one engine by clearing the ring buffer enable bit in its RB ctl. */
static void dma_stop(struct pci_dev *dev, u8 dma)
{
	wr32(dev, rr32(dev, regs_dma_rb_ctl[dma]) & ~DRC_RB_ENA,
							regs_dma_rb_ctl[dma]);
}

/* Halt every DMA engine. */
void dmas_stop(struct pci_dev *dev)
{
	u8 i;

	for (i = 0; i < DMAS_N; ++i)
		dma_stop(dev, i);
}

/*
 * Soft-reset both DMA engines through SRBM, then reprogram each of them.
 * Fix: dropped the local `dd`/pci_get_drvdata() — it was assigned but
 * never used (set-but-unused warning).
 */
void dmas_resume(struct pci_dev *dev)
{
	u8 dma;

	wr32(dev, SSR_SOFT_RESET_DMA0 | SSR_SOFT_RESET_DMA1, SRBM_SOFT_RESET);
	/* read back, presumably to flush the posted mmio write — TODO confirm */
	rr32(dev, SRBM_SOFT_RESET);
	udelay(50);
	wr32(dev, 0, SRBM_SOFT_RESET);

	for (dma = 0; dma < DMAS_N; ++dma)
		resume(dev, dma);
}

/* Set the trap interrupt enable bit in one engine's control register. */
static void intr_ena(struct pci_dev *dev, u8 dma)
{
	wr32(dev, rr32(dev, regs_dma_ctl[dma]) | DC_TRAP_ENA,
							regs_dma_ctl[dma]);
}

/* Enable trap interrupts on every DMA engine. */
void dmas_intr_ena(struct pci_dev *dev)
{
	u8 i;

	for (i = 0; i < DMAS_N; ++i)
		intr_ena(dev, i);
}

/* Clear the trap interrupt enable bit in one engine's control register. */
static void intr_reset(struct pci_dev *dev, u8 dma)
{
	wr32(dev, rr32(dev, regs_dma_ctl[dma]) & ~DC_TRAP_ENA,
							regs_dma_ctl[dma]);
}

/* Disable trap interrupts on every DMA engine. */
void dmas_intr_reset(struct pci_dev *dev)
{
	u8 i;

	for (i = 0; i < DMAS_N; ++i)
		intr_reset(dev, i);
}

/*----------------------------------------------------------------------------*/
/*
 * Read the engine's ring read pointer from its writeback slot (little
 * endian, in bytes) and return it as a dword index.
 */
static u32 rptr_dw_get(struct pci_dev *dev, u8 dma)
{
	struct dev_drv_data *dd;

	dd = pci_get_drvdata(dev);
	return le32_to_cpup(dd->ba.wb_map->cpu_addr
						+ wb_dma_rptr_of[dma]) >> 2;
}

/* per-engine trampoline used by the dmas_rptr_dw_get dispatch table */
static u32 dma0_rptr_dw_get(struct pci_dev *dev)
{
	return rptr_dw_get(dev, 0);
}

/* per-engine trampoline used by the dmas_rptr_dw_get dispatch table */
static u32 dma1_rptr_dw_get(struct pci_dev *dev)
{
	return rptr_dw_get(dev, 1);
}

/* rptr getters indexed by engine id; wired into d->ring by init_once() */
static u32 (*dmas_rptr_dw_get[DMAS_N])(struct pci_dev *dev) = {
	dma0_rptr_dw_get,
	dma1_rptr_dw_get
};
/*----------------------------------------------------------------------------*/

/*----------------------------------------------------------------------------*/
/*
 * Return the driver-side ring write pointer (kept in bytes in
 * dd->dmas[dma].wptr) as a dword index.
 */
static u32 wptr_dw_get(struct pci_dev *dev, u8 dma)
{
	struct dev_drv_data *dd;

	dd = pci_get_drvdata(dev);
	return dd->dmas[dma].wptr >> 2;
}

/* per-engine trampoline used by the dmas_wptr_dw_get dispatch table */
static u32 dma0_wptr_dw_get(struct pci_dev *dev)
{
	return wptr_dw_get(dev, 0);
}

/* per-engine trampoline used by the dmas_wptr_dw_get dispatch table */
static u32 dma1_wptr_dw_get(struct pci_dev *dev)
{
	return wptr_dw_get(dev, 1);
}

/* wptr getters indexed by engine id; wired into d->ring by init_once() */
static u32 (*dmas_wptr_dw_get[DMAS_N])(struct pci_dev *dev) = {
	dma0_wptr_dw_get,
	dma1_wptr_dw_get
};
/*----------------------------------------------------------------------------*/

/*
 * One-time software setup of a single engine: lock, fence state, and the
 * generic ring descriptor (size, prefetch mask, pointer getters).
 * Fix: made static — it is only called by dmas_init_once() below, and all
 * the other per-engine helpers here (resume, dma_stop, intr_ena,
 * intr_reset) are static; a global named `init_once` pollutes the kernel
 * namespace. NOTE(review): confirm dmas.h does not declare it.
 */
static void init_once(struct pci_dev *dev, u8 dma)
{
	struct dev_drv_data *dd;
	struct dma *d;

	dd = pci_get_drvdata(dev);
	d = &dd->dmas[dma];

	spin_lock_init(&d->lock);

	fence_init_once(&d->fence);

	d->ring.dev = dev;
	d->ring.pf_dw_mask = DMA_RING_PF_DW_MASK;
	d->ring.ring_dws_n = 1 << DMA_RING_LOG2_DWS;
	d->ring.rptr_dw_get = dmas_rptr_dw_get[dma];
	d->ring.wptr_dw_get = dmas_wptr_dw_get[dma];
}

/* One-time software init of every DMA engine. */
void dmas_init_once(struct pci_dev *dev)
{
	u8 i;

	for (i = 0; i < DMAS_N; ++i)
		init_once(dev, i);
}

/*
 * Append one little-endian dword to the engine's ring at the driver-side
 * write pointer, then advance the pointer (kept in bytes, wrapped to the
 * ring size). Caller holds the engine lock and commits separately.
 */
void dma_wr(struct pci_dev *dev, u8 dma, u32 v)
{
	struct dev_drv_data *dd;
	struct dma *d;
	u32 __iomem *ring;

	dd = pci_get_drvdata(dev);
	d = &dd->dmas[dma];
	ring = dd->ba.dma_rings_maps[dma]->cpu_addr;

	ring[d->wptr >> 2] = cpu_to_le32(v); /* wptr is dword aligned */
	d->wptr = (d->wptr + 4) & DMA_RING_BYTE_MASK;
}

/*
 * Make the packets written so far visible to the engine: pad the ring up
 * to the fetch alignment with NOP packets, then publish the new write
 * pointer to the hardware.
 */
void dma_commit(struct pci_dev *dev, u8 dma)
{
	struct dev_drv_data *dd;
	u32 __iomem *r;

	dd = pci_get_drvdata(dev);

	/* match ring fetch alignment */
	r = dd->ba.dma_rings_maps[dma]->cpu_addr;
	while ((dd->dmas[dma].wptr >> 2) & DMA_RING_PF_DW_MASK) {
		r[dd->dmas[dma].wptr >> 2] =
				cpu_to_le32(DMA_PKT(DMA_PKT_NOP, 0, 0, 0, 0));
		dd->dmas[dma].wptr += 4;
	}

	wmb();	/* data write operations emitted before dma */

	dd->dmas[dma].wptr &= DMA_RING_BYTE_MASK;
	wr32(dev, dd->dmas[dma].wptr, regs_dma_rb_wptr[dma]);
	/* read back, presumably to flush the posted mmio write — TODO confirm */
	rr32(dev, regs_dma_rb_wptr[dma]);
}

/*
 * Pick an engine for the next transfer: round-robin via the low bit of a
 * global atomic counter.
 */
u8 dmas_select(struct pci_dev *dev)
{
	struct dev_drv_data *dd;

	dd = pci_get_drvdata(dev);
	return atomic_inc_return(&dd->dmas_selector) & 1;
}

/*
 * compute the number of dma_pkt_cpy blocks required to perform the dma blit
 * and keep track of the last block dma size
 */
/*
 * Compute how many dma_pkt_cpy blocks a blit of sz bytes needs; the byte
 * count of the final (possibly short) block is stored in *last_blk_sz.
 * A zero-byte blit yields 0 blocks and a 0 last block.
 */
u32 dma_pkt_cpy_blks_count(u64 sz, u32 *last_blk_sz)
{
	u32 n;
	u64 blk;

	/* chunk by repeated subtraction; avoids u64 division — presumably
	   for 32-bit builds, confirm before changing */
	for (n = 0, blk = 0; sz != 0; ++n) {
		blk = sz > DMA_PKT_SZ_MAX ? DMA_PKT_SZ_MAX : sz;
		sz -= blk;
	}
	*last_blk_sz = blk;
	return n;
}

/*
 * compute the number of dma_u32_fill_cpy blocks required to perform the dma 
 * fill and keep track of the last block dws count 
 */
/*
 * Compute how many dma_pkt_u32_fill blocks a fill of dws_n dwords needs;
 * the dword count of the final (possibly short) block is stored in
 * *last_blk_dws_n. A zero-dword fill yields 0 blocks and a 0 last block.
 */
static u32 dma_pkt_u32_fill_blks_count(u64 dws_n, u32 *last_blk_dws_n)
{
	u32 n;
	u64 blk;

	/* chunk by repeated subtraction; avoids u64 division — presumably
	   for 32-bit builds, confirm before changing */
	for (n = 0, blk = 0; dws_n != 0; ++n) {
		blk = dws_n > DMA_PKT_SZ_MAX ? DMA_PKT_SZ_MAX : dws_n;
		dws_n -= blk;
	}
	*last_blk_dws_n = blk;
	return n;
}

#define FENCE_DWS_N 8
/*
 * Emit the end-of-batch packet stream on one engine: a fence write into
 * the writeback page, a trap packet to raise an interrupt, and an SRBM
 * write flushing the hdp cache. Emits 8 dwords total — must stay in sync
 * with FENCE_DWS_N. Returns the fence sequence number to wait on.
 */
static u32 fence_wr(struct pci_dev *dev, u8 dma)
{
	struct dev_drv_data *dd;
	u64 wb_fence_gpu_addr;
	u32 fence_seq_n;

	dd = pci_get_drvdata(dev);

	/* write the fence, gpu addr must be dw aligned */
	wb_fence_gpu_addr = dd->ba.wb_map->gpu_addr + wb_dma_fence_of[dma];
	dma_wr(dev, dma, DMA_PKT(DMA_PKT_FENCE, 0, 0, 0, 0));
	dma_wr(dev, dma, lower_32_bits(wb_fence_gpu_addr));
	dma_wr(dev, dma, upper_32_bits(wb_fence_gpu_addr));

	fence_seq_n = fence_seq_n_get(&dd->dmas[dma].fence);

	dma_wr(dev, dma, fence_seq_n);

	/* generate an interrupt */
	dma_wr(dev, dma, DMA_PKT(DMA_PKT_TRAP, 0, 0, 0, 0));

	/* flush hdp cache */
	dma_wr(dev, dma, DMA_PKT(DMA_PKT_SRBM_WR, 0, 0, 0, 0));
	dma_wr(dev, dma, (0xf << 16) | (HDP_MEM_COHERENCY_FLUSH_CTL >> 2));
	dma_wr(dev, dma, 1);
	return fence_seq_n;
}

/*
 * Emit dma_pkt_cpy_blks_n copy packets on the engine. Every block but the
 * last moves DMA_PKT_SZ_MAX bytes; the last moves last_blk_dma_sz bytes.
 * Caller holds the engine lock and has reserved the ring space.
 */
void dma_pkt_cpy_blks_wr(struct pci_dev *dev, u8 dma, u64 dst, u64 src,
				u32 dma_pkt_cpy_blks_n, u64 last_blk_dma_sz)
{
	u32 i;

	for (i = dma_pkt_cpy_blks_n; i != 0; --i) {
		u64 sz;

		sz = (i == 1) ? last_blk_dma_sz : DMA_PKT_SZ_MAX;

		dma_wr(dev, dma, DMA_PKT(DMA_PKT_CPY, 1, 0, 0, sz));
		dma_wr(dev, dma, lower_32_bits(dst));
		dma_wr(dev, dma, lower_32_bits(src));
		dma_wr(dev, dma, upper_32_bits(dst));
		dma_wr(dev, dma, upper_32_bits(src));
		dst += sz;
		src += sz;
	}
}

#define DMA_PKT_U32_FILL_BLK_DWS_N 4
/*
 * Emit dma_pkt_u32_fill_blks_n fill packets on the engine. Every block but
 * the last covers DMA_PKT_SZ_MAX dwords; the last covers last_blk_dws_n.
 * Caller holds the engine lock and has reserved the ring space.
 */
static void dma_pkt_u32_fill_blks_wr(struct pci_dev *dev, u8 dma, u64 dst,
		u32 constant, u32 dma_pkt_u32_fill_blks_n, u64 last_blk_dws_n)
{
	u32 i;

	for (i = dma_pkt_u32_fill_blks_n; i != 0; --i) {
		u64 dws;

		dws = (i == 1) ? last_blk_dws_n : DMA_PKT_SZ_MAX;

		dma_wr(dev, dma, DMA_PKT(DMA_PKT_U32_FILL, 0, 0, 0, dws));
		dma_wr(dev, dma, lower_32_bits(dst));
		dma_wr(dev, dma, constant);
		dma_wr(dev, dma, upper_32_bits(dst));
		dst += dws << 2;
	}
}

/*
 * Blocking GPU dma copy of sz bytes from src to dst (GPU addresses).
 * Returns 0, or -DMAS_RING_TIMEOUT / -DMAS_FENCE_TIMEOUT on timeout.
 *
 * Fix: the timeout checks compared r against positive RING_WAIT_TIMEOUT /
 * FENCE_TIMEOUT, while dmas_u32_fill() below tests the negated codes and
 * this file returns its own error codes negated — so the positive
 * comparisons could never match and the timeout paths were dead. Switched
 * to the negative-code convention. NOTE(review): confirm against the
 * ring_wait()/fence_wait() declarations in ring.h/fence.h.
 */
long dmas_cpy(struct pci_dev *dev, u64 dst, u64 src, u64 sz,
					struct dmas_timeouts_info t_info)
{
	u32 dma_pkt_cpy_blks_n;		/* pre-computed */
	u32 last_blk_dma_sz;		/* idem */
	u32 dma_pkt_cpy_blks_dws_n;	/* idem */
	u32 fence_seq_n;
	u8 dma;
	struct dev_drv_data *dd;
	long r;

	dma_pkt_cpy_blks_n = dma_pkt_cpy_blks_count(sz, &last_blk_dma_sz);
	dma_pkt_cpy_blks_dws_n = dma_pkt_cpy_blks_n * DMA_PKT_CPY_BLK_DWS_N;

	dma = dmas_select(dev);

	dd = pci_get_drvdata(dev);

	spin_lock(&dd->dmas[dma].lock);
	/* reserve ring space for the copy packets plus the fence tail */
	r = ring_wait(&dd->dmas[dma].ring, dma_pkt_cpy_blks_dws_n
			+ FENCE_DWS_N, t_info.ring.n_max, t_info.ring.us);
	if (r == -RING_WAIT_TIMEOUT) {
		spin_unlock(&dd->dmas[dma].lock);
		return -DMAS_RING_TIMEOUT;
	}
	dma_pkt_cpy_blks_wr(dev, dma, dst, src, dma_pkt_cpy_blks_n,
							last_blk_dma_sz);
	fence_seq_n = fence_wr(dev, dma);
	dma_commit(dev, dma);

	spin_unlock(&dd->dmas[dma].lock);

	/* wait outside the lock; other batches may queue meanwhile */
	r = fence_wait(&dd->dmas[dma].fence, fence_seq_n, t_info.fence.n_max,
							t_info.fence.us);
	if (r == -FENCE_TIMEOUT)
		return -DMAS_FENCE_TIMEOUT;
	return 0;
}

/*
 * Blocking GPU dma fill of dws_n dwords at GPU address dst with the given
 * 32-bit constant. Returns 0, or -DMAS_RING_TIMEOUT / -DMAS_FENCE_TIMEOUT
 * on timeout.
 */
long dmas_u32_fill(struct pci_dev *dev, u64 dst, u64 dws_n, u32 constant,
					struct dmas_timeouts_info t_info)
{
	u32 dma_pkt_u32_fill_blks_n;		/* pre-computed */
	u32 last_blk_u32_fill_dws_n;		/* idem */
	u32 dma_pkt_u32_fill_blks_dws_n;	/* idem */
	u32 fence_seq_n;
	u8 dma;
	struct dev_drv_data *dd;
	long r;

	dma_pkt_u32_fill_blks_n = dma_pkt_u32_fill_blks_count(dws_n,
						&last_blk_u32_fill_dws_n);
	dma_pkt_u32_fill_blks_dws_n = dma_pkt_u32_fill_blks_n
						* DMA_PKT_U32_FILL_BLK_DWS_N;

	dma = dmas_select(dev);

	dd = pci_get_drvdata(dev);

	spin_lock(&dd->dmas[dma].lock);
	/* reserve ring space for the fill packets plus the fence tail */
	r = ring_wait(&dd->dmas[dma].ring, dma_pkt_u32_fill_blks_dws_n
			+ FENCE_DWS_N, t_info.ring.n_max, t_info.ring.us);
	if (r == -RING_WAIT_TIMEOUT) {
		spin_unlock(&dd->dmas[dma].lock);
		return -DMAS_RING_TIMEOUT;
	}
	dma_pkt_u32_fill_blks_wr(dev, dma, dst, constant,
			dma_pkt_u32_fill_blks_n, last_blk_u32_fill_dws_n);
	fence_seq_n = fence_wr(dev, dma);
	dma_commit(dev, dma);
	
	spin_unlock(&dd->dmas[dma].lock);

	/* wait outside the lock; other batches may queue meanwhile */
	r = fence_wait(&dd->dmas[dma].fence, fence_seq_n, t_info.fence.n_max,
							t_info.fence.us);
	if (r == -FENCE_TIMEOUT) {
		return -DMAS_FENCE_TIMEOUT;
	}
	return 0;
}
#undef DMA_PKT_U32_FILL_BLK_DWS_N
#undef FENCE_DWS_N


/*
 * Disable medium-grain clock gating on one engine; only touch a register
 * when its value actually needs to change.
 */
static void dma_mgcg_dis(struct pci_dev *dev, u8 dma)
{
	u32 v;

	v = rr32(dev, regs_dma_pwr_ctl[dma]);
	if (!(v & DPC_MEM_PWR_OVERRIDE))
		wr32(dev, v | DPC_MEM_PWR_OVERRIDE, regs_dma_pwr_ctl[dma]);

	if (rr32(dev, regs_dma_clk_ctl[dma]) != 0xff000000)
		wr32(dev, 0xff000000, regs_dma_clk_ctl[dma]);
}

/*
 * Enable medium-grain clock gating on one engine; the power control write
 * is skipped when the override bit is already clear, the clock control
 * write is done unconditionally (matches the original behavior).
 */
static void dma_mgcg_ena(struct pci_dev *dev, u8 dma)
{
	u32 v;

	v = rr32(dev, regs_dma_pwr_ctl[dma]);
	if (v & DPC_MEM_PWR_OVERRIDE)
		wr32(dev, v & ~DPC_MEM_PWR_OVERRIDE, regs_dma_pwr_ctl[dma]);
	wr32(dev, 0x00000100, regs_dma_clk_ctl[dma]);
}

/* Disable medium-grain clock gating on every DMA engine. */
void dmas_mgcg_dis(struct pci_dev *dev)
{
	u8 i;

	for (i = 0; i < DMAS_N; ++i)
		dma_mgcg_dis(dev, i);
}

/* Enable medium-grain clock gating on every DMA engine. */
void dmas_mgcg_ena(struct pci_dev *dev)
{
	u8 i;

	for (i = 0; i < DMAS_N; ++i)
		dma_mgcg_ena(dev, i);
}
